# import cleaned data
s44 <- read.csv("//Users/khushir/Documents/Class_Notes/Time Series/Project/store_44.csv", stringsAsFactors = FALSE)
s45 <- read.csv("//Users/khushir/Documents/Class_Notes/Time Series/Project/store_45.csv", stringsAsFactors = FALSE)
s47 <- read.csv("//Users/khushir/Documents/Class_Notes/Time Series/Project/store_47.csv", stringsAsFactors = FALSE)

# drop unnecessary columns
s44 <- s44[ , c("date","sales"), drop = FALSE]
s45 <- s45[ , c("date","sales"), drop = FALSE]
s47 <- s47[ , c("date","sales"), drop = FALSE]
library(stats)
library(TSA)
## 
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## The following object is masked from 'package:utils':
## 
##     tar
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method       from
##   fitted.Arima TSA 
##   plot.Arima   TSA
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
s44$date <- as.Date(s44$date, format="%Y-%m-%d")
s45$date <- as.Date(s45$date, format="%Y-%m-%d")
s47$date <- as.Date(s47$date, format="%Y-%m-%d")
# splitting test and train

train44 <- s44[1:1674, ]
test44 <- s44[(nrow(s44) - 13):nrow(s44), ]

train45 <- s45[1:1674, ]
test45 <- s45[(nrow(s45) - 13):nrow(s45), ]

train47 <- s47[1:1674, ]
test47 <- s47[(nrow(s47) - 13):nrow(s47), ]

Store 44

train44.ts <- ts(train44$sales, frequency = 365)
test44.ts <- ts(test44$sales, frequency = 365)

# plot the time series
plot(train44, type = "l", main = "Time Series Plot of train data (Store 44)", xlab = "Date", ylab = "Sales")

# Plot ACF
acf(train44$sales, main = "ACF of Sales, train data (Store 44)", ylab = "Autocorrelation", xlab = "Lag")

# Plot PACF 
pacf(train44$sales, main='PACF of Sales, train data (Store 44)')

# applying SARIMA(pdq)(PDQ)

sarima44 <- auto.arima(train44$sales, seasonal = TRUE, trace = TRUE, approximation = TRUE)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,1,2) with drift         : 35575.12
##  ARIMA(0,1,0) with drift         : 36378.9
##  ARIMA(1,1,0) with drift         : 36344.69
##  ARIMA(0,1,1) with drift         : 35950.38
##  ARIMA(0,1,0)                    : 36376.9
##  ARIMA(1,1,2) with drift         : 35599.28
##  ARIMA(2,1,1) with drift         : 35593.01
##  ARIMA(3,1,2) with drift         : 35565.19
##  ARIMA(3,1,1) with drift         : 35583.49
##  ARIMA(4,1,2) with drift         : Inf
##  ARIMA(3,1,3) with drift         : Inf
##  ARIMA(2,1,3) with drift         : 35461.12
##  ARIMA(1,1,3) with drift         : 35569.39
##  ARIMA(2,1,4) with drift         : 35533.73
##  ARIMA(1,1,4) with drift         : 35533.85
##  ARIMA(3,1,4) with drift         : Inf
##  ARIMA(2,1,3)                    : 35459.55
##  ARIMA(1,1,3)                    : 35567.4
##  ARIMA(2,1,2)                    : 35573.6
##  ARIMA(3,1,3)                    : Inf
##  ARIMA(2,1,4)                    : 35531.88
##  ARIMA(1,1,2)                    : 35597.74
##  ARIMA(1,1,4)                    : 35531.91
##  ARIMA(3,1,2)                    : 35563.56
##  ARIMA(3,1,4)                    : Inf
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(2,1,3)                    : 35479.18
## 
##  Best model: ARIMA(2,1,3)
summary(sarima44)
## Series: train44$sales 
## ARIMA(2,1,3) 
## 
## Coefficients:
##           ar1     ar2      ma1     ma2      ma3
##       -0.4357  -0.826  -0.1612  0.1283  -0.7296
## s.e.   0.0181   0.041   0.0345  0.0640   0.0282
## 
## sigma^2 = 94414628:  log likelihood = -17733.56
## AIC=35479.13   AICc=35479.18   BIC=35511.66
## 
## Training set error measures:
##                    ME    RMSE      MAE  MPE MAPE      MASE      ACF1
## Training set 166.8022 9699.29 7602.904 -Inf  Inf 0.7602784 0.1542414
checkresiduals(sarima44)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,3)
## Q* = 942.38, df = 5, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 10
# Forecasting
forecast44 <- forecast(sarima44, h = 14)
plot(forecast44, main = "SARIMA Forecast from 2nd to 15th Aug, 2017 (Store 44)", xlab = "day", ylab = "Sales")
abline(h = mean(train44$sales), col = "red", lty = 2)

# Plot the test values
plot(test44, type = "l", col = "black", lwd = 2, ylim = range(c(test44.ts, forecast44$mean)), 
     main = "Comparison of Forecasted and Actual Values (Store 44)", xlab = "Day", ylab = "Sales")

# Add the forecasted values to the plot
# lines(x = seq(length(test44.ts) + 1, length(test44.ts) + length(forecast44$mean)),
#       y = forecast44$mean, col = "blue", lwd = 2)
lines(x = test44$date,
      y = forecast44$mean, col = "blue", lwd = 2)

# Add legend
legend("topright", legend = c("Actual", "Forecasted"), col = c("black", "blue"), lty = 1, lwd = 2)

# Add mean of training data as a reference line
abline(h = mean(train44$sales), col = "red", lty = 2)

# Plot the test values
plot(test44, type = "l", col = "black", lwd = 2, ylim = range(c(test44, forecast44$upper)), 
     main = "Comparison of Forecasted and Actual Values (Store 44)", xlab = "Day", ylab = "Sales")

# Add the forecasted values to the plot
lines(x = test44$date, y = forecast44$mean, col = "orange", lwd = 2)
lines(x = test44$date, y = forecast44$upper[1:14], col = "blue", lwd = 2)
lines(x = test44$date, y = forecast44$lower[1:14], col = "purple", lwd = 2)

# Add legend
legend("topright", legend = c("Actual", "Forecasted Mean", "Forecasted Upper", "Forecasted Lower"), col = c("black",  "orange", "blue", "purple"), lty = 1, lwd = 2)

abline(h = mean(train44$sales), col = "red", lty = 2)

# Install and load the Metrics package if you haven't already
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
## 
##     accuracy
# Calculate RMSE
rmse44 <- rmse(forecast44$mean, test44$sales)
rmse44
## [1] 9099.985

Store 45

train45.ts <- ts(train45$sales, frequency = 365)
test45.ts <- ts(test45$sales, frequency = 365)

# plot the time series
plot(train45, type = "l", main = "Time Series Plot of train data (Store 45)", xlab = "Date", ylab = "Sales")

# Plot ACF
acf(train45$sales, main = "ACF of Sales, train data (Store 45)", ylab = "Autocorrelation", xlab = "Lag")

# Plot PACF 
pacf(train45$sales, main='PACF of Sales, train data (Store 45)')

# applying SARIMA(pdq)(PDQ)

sarima45 <- auto.arima(train45$sales, seasonal = TRUE, trace = TRUE, approximation = TRUE)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,1,2) with drift         : 35313.28
##  ARIMA(0,1,0) with drift         : 36041
##  ARIMA(1,1,0) with drift         : 36020.72
##  ARIMA(0,1,1) with drift         : 35768.77
##  ARIMA(0,1,0)                    : 36039.01
##  ARIMA(1,1,2) with drift         : 35384.8
##  ARIMA(2,1,1) with drift         : 35351.28
##  ARIMA(3,1,2) with drift         : 35328.81
##  ARIMA(2,1,3) with drift         : 35097.09
##  ARIMA(1,1,3) with drift         : 35375.44
##  ARIMA(3,1,3) with drift         : 35240.46
##  ARIMA(2,1,4) with drift         : Inf
##  ARIMA(1,1,4) with drift         : 35348.01
##  ARIMA(3,1,4) with drift         : Inf
##  ARIMA(2,1,3)                    : Inf
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(2,1,3) with drift         : Inf
##  ARIMA(3,1,3) with drift         : 35259.92
## 
##  Best model: ARIMA(3,1,3) with drift
summary(sarima45)
## Series: train45$sales 
## ARIMA(3,1,3) with drift 
## 
## Coefficients:
##          ar1     ar2      ar3      ma1      ma2     ma3    drift
##       0.1248  0.2553  -0.4213  -0.5945  -0.7705  0.5113  14.6809
## s.e.  0.0511  0.0432   0.0351   0.0473   0.0449  0.0318  31.2816
## 
## sigma^2 = 82746787:  log likelihood = -17621.92
## AIC=35259.83   AICc=35259.92   BIC=35303.21
## 
## Training set error measures:
##                    ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 18.89377 9074.764 6702.101 -Inf  Inf 0.7509208 -0.04092634
checkresiduals(sarima45)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,3) with drift
## Q* = 407.85, df = 4, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 10
# Forecasting
forecast45 <- forecast(sarima45, h = 14)
plot(forecast45, main = "SARIMA Forecast from 2nd to 15th Aug, 2017 (Store 45)", xlab = "day", ylab = "Sales")
abline(h = mean(train45$sales), col = "red", lty = 2)

# Plot the test values
plot(test45, type = "l", col = "black", lwd = 2, ylim = range(c(test45.ts, forecast45$mean)), 
     main = "Comparison of Forecasted and Actual Values (Store 45)", xlab = "Day", ylab = "Sales")

# Add the forecasted values to the plot
# lines(x = seq(length(test45.ts) + 1, length(test45.ts) + length(forecast45$mean)),
#       y = forecast45$mean, col = "blue", lwd = 2)
lines(x = test45$date,
      y = forecast45$mean, col = "blue", lwd = 2)

# Add legend
legend("topright", legend = c("Actual", "Forecasted"), col = c("black", "blue"), lty = 1, lwd = 2)

# Add mean of training data as a reference line
abline(h = mean(train45$sales), col = "red", lty = 2)

# Plot the test values
plot(test45, type = "l", col = "black", lwd = 2, ylim = range(c(test45, forecast45$upper)), 
     main = "Comparison of Forecasted and Actual Values (Store 45)", xlab = "Day", ylab = "Sales")

# Add the forecasted values to the plot
lines(x = test45$date, y = forecast45$mean, col = "orange", lwd = 2)
lines(x = test45$date, y = forecast45$upper[1:14], col = "blue", lwd = 2)
lines(x = test45$date, y = forecast45$lower[1:14], col = "purple", lwd = 2)

# Add legend
legend("topright", legend = c("Actual", "Forecasted Mean", "Forecasted Upper", "Forecasted Lower"), col = c("black",  "orange", "blue", "purple"), lty = 1, lwd = 2)

abline(h = mean(train45$sales), col = "red", lty = 2)

# Install and load the Metrics package if you haven't already
library(Metrics)

# Calculate RMSE
rmse45 <- rmse(forecast45$mean, test45$sales)
rmse45
## [1] 7984.638

Store 47

train47.ts <- ts(train47$sales, frequency = 365)
test47.ts <- ts(test47$sales, frequency = 365)

# plot the time series
plot(train47, type = "l", main = "Time Series Plot of train data (Store 47)", xlab = "Date", ylab = "Sales")

# Plot ACF
acf(train47$sales, main = "ACF of Sales, train data (Store 47)", ylab = "Autocorrelation", xlab = "Lag")

# Plot PACF 
pacf(train47$sales, main='PACF of Sales, train data (Store 47)')

# applying SARIMA(pdq)(PDQ)

sarima47 <- auto.arima(train47$sales, seasonal = TRUE, trace = TRUE, approximation = TRUE)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,1,2) with drift         : 34980.23
##  ARIMA(0,1,0) with drift         : 35734.26
##  ARIMA(1,1,0) with drift         : 35720.5
##  ARIMA(0,1,1) with drift         : 35469.22
##  ARIMA(0,1,0)                    : 35732.26
##  ARIMA(1,1,2) with drift         : 35040.2
##  ARIMA(2,1,1) with drift         : 35001.59
##  ARIMA(3,1,2) with drift         : 34982.35
##  ARIMA(2,1,3) with drift         : 34975.92
##  ARIMA(1,1,3) with drift         : 35036.46
##  ARIMA(3,1,3) with drift         : 34906.46
##  ARIMA(4,1,3) with drift         : Inf
##  ARIMA(3,1,4) with drift         : Inf
##  ARIMA(2,1,4) with drift         : Inf
##  ARIMA(4,1,2) with drift         : Inf
##  ARIMA(4,1,4) with drift         : Inf
##  ARIMA(3,1,3)                    : 34904.71
##  ARIMA(2,1,3)                    : 34974.93
##  ARIMA(3,1,2)                    : 34981.23
##  ARIMA(4,1,3)                    : Inf
##  ARIMA(3,1,4)                    : Inf
##  ARIMA(2,1,2)                    : 34978.69
##  ARIMA(2,1,4)                    : Inf
##  ARIMA(4,1,2)                    : Inf
##  ARIMA(4,1,4)                    : Inf
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(3,1,3)                    : 34925.65
## 
##  Best model: ARIMA(3,1,3)
summary(sarima47)
## Series: train47$sales 
## ARIMA(3,1,3) 
## 
## Coefficients:
##          ar1     ar2      ar3      ma1      ma2     ma3
##       0.1721  0.1803  -0.3921  -0.6335  -0.6993  0.4738
## s.e.  0.0558  0.0468   0.0376   0.0518   0.0532  0.0326
## 
## sigma^2 = 67800206:  log likelihood = -17455.79
## AIC=34925.58   AICc=34925.65   BIC=34963.54
## 
## Training set error measures:
##                    ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 113.6987 8216.854 6214.637 -Inf  Inf 0.7762624 -0.03043194
checkresiduals(sarima47)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,3)
## Q* = 546.98, df = 4, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 10
# Forecasting
forecast47 <- forecast(sarima47, h = 14)
plot(forecast47, main = "SARIMA Forecast from 2nd to 15th Aug, 2017 (Store 47)", xlab = "day", ylab = "Sales")
abline(h = mean(train47$sales), col = "red", lty = 2)

# Plot the test values
plot(test47, type = "l", col = "black", lwd = 2, ylim = range(c(test47.ts, forecast47$mean)), 
     main = "Comparison of Forecasted and Actual Values (Store 47)", xlab = "Day", ylab = "Sales")

# Add the forecasted values to the plot
lines(x = test47$date,
      y = forecast47$mean, col = "blue", lwd = 2)

# Add legend
legend("topright", legend = c("Actual", "Forecasted"), col = c("black", "blue"), lty = 1, lwd = 2)

abline(h = mean(train47$sales), col = "red", lty = 2)

# Plot the test values
plot(test47, type = "l", col = "black", lwd = 2, ylim = range(c(test47, forecast47$upper)), 
     main = "Comparison of Forecasted and Actual Values (Store 47)", xlab = "Day", ylab = "Sales")

# Add the forecasted values to the plot
lines(x = test47$date, y = forecast47$mean, col = "orange", lwd = 2)
lines(x = test47$date, y = forecast47$upper[1:14], col = "blue", lwd = 2)
lines(x = test47$date, y = forecast47$lower[1:14], col = "purple", lwd = 2)

# Add legend
legend("topright", legend = c("Actual", "Forecasted Mean", "Forecasted Upper", "Forecasted Lower"), col = c("black",  "orange", "blue", "purple"), lty = 1, lwd = 2)

abline(h = mean(train47$sales), col = "red", lty = 2)

# Install and load the Metrics package if you haven't already
library(Metrics)

# Calculate RMSE
rmse47 <- rmse(forecast47$mean, test47$sales)
rmse47
## [1] 7290.442